"""Dynamic pricing analysis script for ride data.

Loads ``dynamic_pricing.csv``, explores it, derives demand/supply based price
multipliers, trains a random-forest price model and a logistic-regression
"loss risk" model, and plots a handful of summary metrics.

The script runs top to bottom at import time (it reads the CSV and opens
plotly figures), mirroring the notebook it was derived from.
"""

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Percentiles used to normalise rider / driver counts into multipliers.
high_demand_percentile = 75
low_demand_percentile = 25
high_supply_percentile = 75
low_supply_percentile = 25

# Price adjustment factors. Only ``demand_threshold_low`` and
# ``supply_threshold_high`` are used (as floors) by the cost formula below;
# the other two are kept because they are part of the module's namespace.
demand_threshold_high = 1.2
demand_threshold_low = 0.8
supply_threshold_high = 0.8
supply_threshold_low = 1.2

# Single source of truth for encoding Vehicle_Type, shared by the training
# data encoding and by predict_price().
VEHICLE_TYPE_MAPPING = {"Premium": 1, "Economy": 0}


# ---------------------------------------------------------------------------
# Pricing helpers
# ---------------------------------------------------------------------------

def add_demand_supply_multipliers(data):
    """Add ``demand_multiplier`` and ``supply_multiplier`` columns in place.

    Demand: rider count divided by the high-percentile rider count when the
    row is above that percentile, otherwise divided by the low-percentile
    rider count.

    Supply: high-percentile driver count divided by the row's driver count
    when the row is above the *low* percentile, otherwise the low-percentile
    driver count divided by the row's driver count.

    Args:
        data: DataFrame with ``Number_of_Riders`` and ``Number_of_Drivers``.

    Returns:
        The same DataFrame object, for chaining.
    """
    riders = data['Number_of_Riders']
    # Percentiles are loop-invariant; compute each once.
    riders_high = np.percentile(riders, high_demand_percentile)
    riders_low = np.percentile(riders, low_demand_percentile)
    data['demand_multiplier'] = np.where(
        riders > riders_high,
        riders / riders_high,
        riders / riders_low,
    )

    drivers = data['Number_of_Drivers']
    drivers_high = np.percentile(drivers, high_supply_percentile)
    drivers_low = np.percentile(drivers, low_supply_percentile)
    # NOTE(review): the condition compares against the LOW percentile while
    # the demand branch compares against the HIGH one. Kept exactly as in the
    # original script; confirm whether the asymmetry is intended.
    data['supply_multiplier'] = np.where(
        drivers > drivers_low,
        drivers_high / drivers,
        drivers_low / drivers,
    )
    return data


def compute_adjusted_ride_cost(data, extra_multiplier_columns=()):
    """Return the dynamically adjusted ride cost as a Series.

    The historical cost is scaled by the demand and supply multipliers, each
    floored at its threshold, and then by any additional multiplier columns.

    Args:
        data: DataFrame with ``Historical_Cost_of_Ride``,
            ``demand_multiplier`` and ``supply_multiplier`` columns.
        extra_multiplier_columns: Names of further columns whose values are
            multiplied into the adjustment factor, in order.

    Returns:
        A Series aligned with ``data``.
    """
    factor = (
        np.maximum(data['demand_multiplier'], demand_threshold_low)
        * np.maximum(data['supply_multiplier'], supply_threshold_high)
    )
    for column in extra_multiplier_columns:
        factor = factor * data[column]
    return data['Historical_Cost_of_Ride'] * factor


# ---------------------------------------------------------------------------
# Block 1: Load the dataset
# ---------------------------------------------------------------------------

data = pd.read_csv("dynamic_pricing.csv")
# Bare ``data.head()`` only displays in a notebook; print so a script run
# shows the preview too.
print(data.head())

# ---------------------------------------------------------------------------
# Block 2: Exploratory Data Analysis
# ---------------------------------------------------------------------------

# Descriptive statistics
print(data.describe())

# Scatter plot: Expected Ride Duration vs. Historical Cost of Ride
# (trendline='ols' relies on plotly's optional statsmodels dependency).
fig = px.scatter(
    data,
    x='Expected_Ride_Duration',
    y='Historical_Cost_of_Ride',
    title='Expected Ride Duration vs. Historical Cost of Ride',
    trendline='ols',
)
fig.show()

# Box plot: Historical Cost of Ride Distribution by Vehicle Type
fig = px.box(
    data,
    x='Vehicle_Type',
    y='Historical_Cost_of_Ride',
    title='Historical Cost of Ride Distribution by Vehicle Type',
)
fig.show()

# Correlation matrix. Restrict to numeric columns explicitly: the frame still
# contains string columns here, and DataFrame.corr() raises on those in
# pandas >= 2.0.
corr_matrix = data.select_dtypes(include='number').corr()
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='Viridis',
))
fig.update_layout(title='Correlation Matrix')
fig.show()

# ---------------------------------------------------------------------------
# Block 3: Implementing Dynamic Pricing Strategy
# ---------------------------------------------------------------------------

add_demand_supply_multipliers(data)
data['adjusted_ride_cost'] = compute_adjusted_ride_cost(data)

# ---------------------------------------------------------------------------
# Block 4: Profitability Analysis
# ---------------------------------------------------------------------------

# Percentage change of the adjusted cost relative to the historical cost.
data['profit_percentage'] = (
    (data['adjusted_ride_cost'] - data['Historical_Cost_of_Ride'])
    / data['Historical_Cost_of_Ride']
) * 100

# Identify profitable and loss rides (rows at exactly 0% fall in neither).
profitable_rides = data[data['profit_percentage'] > 0]
loss_rides = data[data['profit_percentage'] < 0]

# Donut chart for profitability distribution
profitable_count = len(profitable_rides)
loss_count = len(loss_rides)

labels = ['Profitable Rides', 'Loss Rides']
values = [profitable_count, loss_count]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.4)])
fig.update_layout(
    title='Profitability of Rides (Dynamic Pricing vs. Historical Pricing)'
)
fig.show()

# Scatter plot: Expected Ride Duration vs. Adjusted Cost of Ride
fig = px.scatter(
    data,
    x='Expected_Ride_Duration',
    y='adjusted_ride_cost',
    title='Expected Ride Duration vs. Cost of Ride',
    trendline='ols',
)
fig.show()


# ---------------------------------------------------------------------------
# Block 5: Data Preprocessing for Machine Learning
# ---------------------------------------------------------------------------

def data_preprocessing_pipeline(data):
    """Impute missing values and damp outliers, modifying ``data`` in place.

    Steps, in order:
      1. Numeric NaNs are replaced with the column mean.
      2. Numeric values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are replaced
         with the column mean (computed before the replacement).
      3. Categorical (object) NaNs are replaced with the column mode.

    Args:
        data: DataFrame to clean. It is mutated.

    Returns:
        The same DataFrame object.
    """
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
    categorical_features = data.select_dtypes(include=['object']).columns

    # Handle missing values in numeric features
    data[numeric_features] = data[numeric_features].fillna(
        data[numeric_features].mean()
    )

    # Handle outliers in numeric features
    for feature in numeric_features:
        q1 = data[feature].quantile(0.25)
        q3 = data[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        is_outlier = (data[feature] < lower_bound) | (data[feature] > upper_bound)
        data[feature] = np.where(is_outlier, data[feature].mean(), data[feature])

    # Handle missing values in categorical features. Guarded because
    # ``.mode().iloc[0]`` raises IndexError when there are no object columns.
    if len(categorical_features) > 0:
        data[categorical_features] = data[categorical_features].fillna(
            data[categorical_features].mode().iloc[0]
        )

    return data


# Apply preprocessing. Note this also touches the derived numeric columns
# created above (multipliers, adjusted cost, profit percentage).
data = data_preprocessing_pipeline(data)

# Convert Vehicle_Type to numeric
data["Vehicle_Type"] = data["Vehicle_Type"].map(VEHICLE_TYPE_MAPPING)

# ---------------------------------------------------------------------------
# Block 6: Training a Machine Learning Model
# ---------------------------------------------------------------------------

# Prepare features and target
X = np.array(data[[
    "Number_of_Riders",
    "Number_of_Drivers",
    "Vehicle_Type",
    "Expected_Ride_Duration",
]])
y = np.array(data["adjusted_ride_cost"])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the model. Seeded so that, like the split above, runs are repeatable.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)


# ---------------------------------------------------------------------------
# Block 7: Model Prediction Function
# ---------------------------------------------------------------------------

def get_vehicle_type_numeric(vehicle_type):
    """Return the numeric code for ``vehicle_type``, or ``None`` if unknown."""
    return VEHICLE_TYPE_MAPPING.get(vehicle_type)


def predict_price(number_of_riders, number_of_drivers, vehicle_type,
                  Expected_Ride_Duration):
    """Predict the adjusted ride cost with the trained module-level ``model``.

    Args:
        number_of_riders: Rider count for the request.
        number_of_drivers: Driver count for the request.
        vehicle_type: ``"Premium"`` or ``"Economy"``.
        Expected_Ride_Duration: Expected ride duration, in the same unit as
            the training column of that name.

    Returns:
        The array returned by ``model.predict`` for the single input row.

    Raises:
        ValueError: If ``vehicle_type`` is not a known vehicle type.
    """
    vehicle_type_numeric = get_vehicle_type_numeric(vehicle_type)
    if vehicle_type_numeric is None:
        raise ValueError("Invalid vehicle type")

    # Feature order must match the column order used to build X.
    input_data = np.array([[
        number_of_riders,
        number_of_drivers,
        vehicle_type_numeric,
        Expected_Ride_Duration,
    ]])
    return model.predict(input_data)


# Example prediction
user_number_of_riders = 50
user_number_of_drivers = 25
user_vehicle_type = "Economy"
Expected_Ride_Duration = 30
predicted_price = predict_price(
    user_number_of_riders,
    user_number_of_drivers,
    user_vehicle_type,
    Expected_Ride_Duration,
)
print("Predicted price:", predicted_price)

# ---------------------------------------------------------------------------
# Block 8: Model Evaluation Visualization
# ---------------------------------------------------------------------------

y_pred = model.predict(X_test)

# Endpoints of the y = x reference line.
actual_min, actual_max = y_test.min(), y_test.max()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=y_test,
    y=y_pred,
    mode='markers',
    name='Actual vs Predicted',
))
fig.add_trace(go.Scatter(
    x=[actual_min, actual_max],
    y=[actual_min, actual_max],
    mode='lines',
    name='Ideal',
    line=dict(color='red', dash='dash'),
))
fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Actual Values',
    yaxis_title='Predicted Values',
    showlegend=True,
)
fig.show()

# ---------------------------------------------------------------------------
# Extended pricing: time-of-booking and loyalty factors
# ---------------------------------------------------------------------------

# Values not present in a mapping become NaN and propagate into the cost.
data['time_multiplier'] = data['Time_of_Booking'].map(
    {'Night': 1.2, 'Evening': 1.1, 'Afternoon': 1.0, 'Morning': 0.9}
)
data['loyalty_multiplier'] = data['Customer_Loyalty_Status'].map(
    {'Gold': 0.9, 'Silver': 0.95, 'Regular': 1.0}
)

# Modify the adjusted_ride_cost calculation
data['adjusted_ride_cost'] = compute_adjusted_ride_cost(
    data, extra_multiplier_columns=('time_multiplier', 'loyalty_multiplier')
)

# ---------------------------------------------------------------------------
# Risk assessment model
# ---------------------------------------------------------------------------

# Prepare features for risk assessment
risk_features = [
    'Number_of_Riders',
    'Number_of_Drivers',
    'Number_of_Past_Rides',
    'Average_Ratings',
]
X_risk = data[risk_features]
y_risk = (data['profit_percentage'] < 0).astype(int)  # 1 if loss, 0 if profit

# Split data for risk model
X_risk_train, X_risk_test, y_risk_train, y_risk_test = train_test_split(
    X_risk, y_risk, test_size=0.2, random_state=42
)

# Scale features: fit on the training split only, then apply to the test split.
scaler = StandardScaler()
X_risk_train_scaled = scaler.fit_transform(X_risk_train)
X_risk_test_scaled = scaler.transform(X_risk_test)

# Train logistic regression model
risk_model = LogisticRegression()
risk_model.fit(X_risk_train_scaled, y_risk_train)

# Predict risk scores (probability of the positive "loss" class)
risk_scores = risk_model.predict_proba(X_risk_test_scaled)[:, 1]

# Evaluate risk model
risk_auc = roc_auc_score(y_risk_test, risk_scores)
print(f"Risk Assessment Model AUC: {risk_auc}")


# ---------------------------------------------------------------------------
# Summary metrics
# ---------------------------------------------------------------------------

def analyze_engagement(data):
    """Return the "engagement increase" percentage for ``data``.

    A ride counts as engaged when its adjusted cost is at most 110% of its
    historical cost. The result is ``(engaged_fraction - 1) * 100``.

    NOTE(review): because the engaged fraction is at most 1, this value is
    never positive. Formula kept as in the original; confirm the intended
    baseline.

    Raises:
        ZeroDivisionError: If ``data`` has no rows.
    """
    initial_rides = len(data)
    # 10% price increase tolerance
    within_tolerance = (
        data['adjusted_ride_cost'] <= data['Historical_Cost_of_Ride'] * 1.1
    )
    engaged_rides = int(within_tolerance.sum())
    engagement_rate = engaged_rides / initial_rides
    return (engagement_rate - 1) * 100


engagement_increase = analyze_engagement(data)
print(f"Engagement increase: {engagement_increase:.2f}%")


def track_fraud_reduction(risk_scores, threshold=0.5):
    """Return the percentage reduction versus an assumed 10% fraud rate.

    The "new" fraud rate is the fraction of ``risk_scores`` strictly above
    ``threshold``.

    Args:
        risk_scores: Sequence of scores.
        threshold: Score above which an entry counts as flagged.

    Raises:
        ZeroDivisionError: If ``risk_scores`` is empty.
    """
    initial_fraud_rate = 0.1  # Assume initial fraud rate of 10%
    flagged = np.count_nonzero(np.asarray(risk_scores) > threshold)
    new_fraud_rate = flagged / len(risk_scores)
    return (initial_fraud_rate - new_fraud_rate) / initial_fraud_rate * 100


def track_safety_improvement(data):
    """Return the "safety improvement" percentage for ``data``.

    Computes the fraction of rows with ``Average_Ratings >= 4.5`` and
    expresses its difference from the mean rating relative to that mean.

    NOTE(review): this compares a fraction of rows with a mean rating, which
    are different quantities. Formula kept as in the original; confirm the
    intended definition.
    """
    initial_safety = data['Average_Ratings'].mean()
    safety_threshold = 4.5
    improved_safety = (
        len(data[data['Average_Ratings'] >= safety_threshold]) / len(data)
    )
    return (improved_safety - initial_safety) / initial_safety * 100


fraud_reduction = track_fraud_reduction(risk_scores)
safety_improvement = track_safety_improvement(data)
print(f"Fraud reduction: {fraud_reduction:.2f}%")
print(f"Safety improvement: {safety_improvement:.2f}%")

# Bar chart for key metrics
fig = go.Figure(data=[
    go.Bar(name='Engagement Increase', x=['Engagement'], y=[engagement_increase]),
    go.Bar(name='Fraud Reduction', x=['Fraud'], y=[fraud_reduction]),
    go.Bar(name='Safety Improvement', x=['Safety'], y=[safety_improvement]),
])
fig.update_layout(title='Key Performance Metrics', barmode='group')
fig.show()

# ---------------------------------------------------------------------------
# Recompute base pricing on the preprocessed data and visualise it
# ---------------------------------------------------------------------------

# As in the original script, this recomputes the multipliers from the cleaned
# rider/driver counts and overwrites adjusted_ride_cost with the base formula
# (without the time and loyalty factors).
add_demand_supply_multipliers(data)
data['adjusted_ride_cost'] = compute_adjusted_ride_cost(data)

print(data.columns)

fig = go.Figure(data=[go.Scatter3d(
    x=data['Expected_Ride_Duration'],
    y=data['Number_of_Riders'],
    z=data['adjusted_ride_cost'],
    mode='markers',
    marker=dict(
        size=5,
        color=data['adjusted_ride_cost'],
        colorscale='Viridis',
        opacity=0.8,
    ),
    text=data['Vehicle_Type'],
    hoverinfo='text',
)])
fig.show()